{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Model Complexity Influence\n\nDemonstrate how model complexity influences both prediction accuracy and\ncomputational performance.\n\nThe dataset is the Boston Housing dataset for regression and the\n20 Newsgroups dataset for classification.\n\nFor each class of models, we vary the model complexity through the choice of\nrelevant model parameters and measure the influence on both computational\nperformance (latency) and predictive power (MSE or Hamming loss).\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)\n\n# Author: Eustache Diemert <eustache@diemert.fr>\n# License: BSD 3 clause\n\nimport time\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom mpl_toolkits.axes_grid1.parasite_axes import host_subplot\nfrom mpl_toolkits.axisartist.axislines import Axes\n# ``scipy.sparse.csr`` is a private module (removed in SciPy >= 1.8):\n# import from the public ``scipy.sparse`` namespace instead.\nfrom scipy.sparse import csr_matrix\n\nfrom sklearn import datasets\nfrom sklearn.utils import shuffle\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm import NuSVR\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.metrics import hamming_loss\n\n# #############################################################################\n# Routines\n\n\n# Initialize random generator so the shuffling below is reproducible\nnp.random.seed(0)\n\n\ndef generate_data(case, sparse=False):\n    \"\"\"Generate an 80/20 train/test split for regression or classification.\n\n    Parameters\n    ----------\n    case : {'regression', 'classification'}\n        Which benchmark dataset to load.\n    sparse : bool, default=False\n        If True, convert the feature matrices to CSR sparse format.\n\n    Returns\n    -------\n    dict\n        Keys 'X_train', 'X_test', 'y_train' and 'y_test'.\n    \"\"\"\n    if case == 'regression':\n        # NOTE: load_boston is deprecated (removed in scikit-learn >= 1.2);\n        # switch to another regression dataset (e.g. load_diabetes) when\n        # upgrading.\n        X, y = datasets.load_boston(return_X_y=True)\n    elif case == 'classification':\n        X, y = datasets.fetch_20newsgroups_vectorized(subset='all',\n                                                      return_X_y=True)\n    else:\n        # Fail loudly instead of hitting a NameError on unbound X, y below.\n        raise ValueError(\n            \"case must be 'regression' or 'classification', got %r\" % (case,))\n    X, y = shuffle(X, y)\n    offset = int(X.shape[0] * 0.8)\n    X_train, y_train = X[:offset], y[:offset]\n    X_test, y_test = X[offset:], y[offset:]\n    if sparse:\n        X_train = csr_matrix(X_train)\n        X_test = csr_matrix(X_test)\n    else:\n        X_train = np.array(X_train)\n        X_test = np.array(X_test)\n    y_test = np.array(y_test)\n    y_train = np.array(y_train)\n    data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train,\n            'y_test': y_test}\n    return data\n\n\ndef benchmark_influence(conf):\n    \"\"\"Benchmark the influence of conf['changing_param'] on both prediction\n    performance and latency.\n\n    Returns\n    -------\n    tuple of lists\n        (prediction_powers, prediction_times, complexities), one entry per\n        value in conf['changing_param_values'].\n    \"\"\"\n    prediction_times = []\n    prediction_powers = []\n    complexities = []\n    for param_value in conf['changing_param_values']:\n        conf['tuned_params'][conf['changing_param']] = param_value\n        estimator = conf['estimator'](**conf['tuned_params'])\n        print(\"Benchmarking %s\" % estimator)\n        estimator.fit(conf['data']['X_train'], conf['data']['y_train'])\n        conf['postfit_hook'](estimator)\n        complexity = conf['complexity_computer'](estimator)\n        complexities.append(complexity)\n        # Use a monotonic, high-resolution clock to time predictions;\n        # time.time() can jump if the system clock is adjusted.\n        start_time = time.perf_counter()\n        for _ in range(conf['n_samples']):\n            y_pred = estimator.predict(conf['data']['X_test'])\n        elapsed_time = (time.perf_counter() - start_time) / float(\n            conf['n_samples'])\n        prediction_times.append(elapsed_time)\n        pred_score = conf['prediction_performance_computer'](\n            conf['data']['y_test'], y_pred)\n        prediction_powers.append(pred_score)\n        print(\"Complexity: %d | %s: %.4f | Pred. Time: %fs\\n\" % (\n            complexity, conf['prediction_performance_label'], pred_score,\n            elapsed_time))\n    return prediction_powers, prediction_times, complexities\n\n\ndef plot_influence(conf, mse_values, prediction_times, complexities):\n    \"\"\"Plot influence of model complexity on both accuracy and latency,\n    using a twin y-axis (prediction error on the left, time on the right).\n    \"\"\"\n    plt.figure(figsize=(12, 6))\n    host = host_subplot(111, axes_class=Axes)\n    plt.subplots_adjust(right=0.75)\n    par1 = host.twinx()\n    host.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])\n    y1_label = conf['prediction_performance_label']\n    y2_label = \"Time (s)\"\n    host.set_ylabel(y1_label)\n    par1.set_ylabel(y2_label)\n    p1, = host.plot(complexities, mse_values, 'b-', label=\"prediction error\")\n    p2, = par1.plot(complexities, prediction_times, 'r-',\n                    label=\"latency\")\n    host.legend(loc='upper right')\n    # Match each axis label colour to its curve so the plot reads at a glance\n    host.axis[\"left\"].label.set_color(p1.get_color())\n    par1.axis[\"right\"].label.set_color(p2.get_color())\n    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)\n    plt.show()\n\n\ndef _count_nonzero_coefficients(estimator):\n    \"\"\"Return the number of non-zero coefficients of a (sparsified)\n    linear model; used as the complexity measure for SGDClassifier.\"\"\"\n    a = estimator.coef_.toarray()\n    return np.count_nonzero(a)\n\n# #############################################################################\n# Main code\nregression_data = generate_data('regression')\nclassification_data = generate_data('classification', sparse=True)\nconfigurations = [\n    {'estimator': SGDClassifier,\n     'tuned_params': {'penalty': 'elasticnet', 'alpha': 0.001, 'loss':\n                      'modified_huber', 'fit_intercept': True, 'tol': 1e-3},\n     'changing_param': 'l1_ratio',\n     'changing_param_values': [0.25, 0.5, 0.75, 0.9],\n     'complexity_label': 'non_zero coefficients',\n     'complexity_computer': _count_nonzero_coefficients,\n     'prediction_performance_computer': hamming_loss,\n     'prediction_performance_label': 'Hamming Loss (Misclassification Ratio)',\n     'postfit_hook': lambda x: x.sparsify(),\n     'data': classification_data,\n     'n_samples': 30},\n    {'estimator': NuSVR,\n     'tuned_params': {'C': 1e3, 'gamma': 2 ** -15},\n     'changing_param': 'nu',\n     'changing_param_values': [0.1, 0.25, 0.5, 0.75, 0.9],\n     'complexity_label': 'n_support_vectors',\n     'complexity_computer': lambda x: len(x.support_vectors_),\n     'data': regression_data,\n     'postfit_hook': lambda x: x,\n     'prediction_performance_computer': mean_squared_error,\n     'prediction_performance_label': 'MSE',\n     'n_samples': 30},\n    {'estimator': GradientBoostingRegressor,\n     # NOTE: 'ls' was renamed to 'squared_error' in scikit-learn >= 1.0\n     'tuned_params': {'loss': 'ls'},\n     'changing_param': 'n_estimators',\n     'changing_param_values': [10, 50, 100, 200, 500],\n     'complexity_label': 'n_trees',\n     'complexity_computer': lambda x: x.n_estimators,\n     'data': regression_data,\n     'postfit_hook': lambda x: x,\n     'prediction_performance_computer': mean_squared_error,\n     'prediction_performance_label': 'MSE',\n     'n_samples': 30},\n]\nfor conf in configurations:\n    prediction_performances, prediction_times, complexities = \\\n        benchmark_influence(conf)\n    plot_influence(conf, prediction_performances, prediction_times,\n                   complexities)"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}